Approach: Major cell types (level 3) vs Mostafavi vs MF

Previous script: 01_prepare_input.R

# Reference: Dataset 01 
# Test: Dataset 02
# Rationale: Are the modules from the **Reference** networks preserved in the **Test** data set?

### SET PARAMETERS

snakemake_sn_input_dir = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_singlenuclei/2nd_pass/snakemake-sn/input/"
snakemake_sn_output_dir = "/pastel/projects/speakeasy_dlpfc/SpeakEasy_singlenuclei/2nd_pass/snakemake-sn/results/"
cluster_lv = "3"
numthreads = 2

cell_ids = c("ast", "mic", "oli", "end", "opc", "ext", "inh", "Sara", "bulk_MF")
nSets = length(cell_ids)
# Heavy chunk. Calculate pairwise module preservation stats
multiExpr = list()
multiColor = list()
for (set in 1:nSets) {
  message(paste("Working on set:", cell_ids[set]))
  # Residuals of the expression: 
  res_dataset_01 = read.table(paste0(snakemake_sn_input_dir, cell_ids[set], ".txt"), header = T, check.names = F, stringsAsFactors = F)
  res_dataset_01 = as.data.frame(t(res_dataset_01))
  # clusters from SpeakEasy: 
  k_dataset_01 = read.table(paste0(snakemake_sn_output_dir, cell_ids[set], "/geneBycluster.txt"), header = T, stringsAsFactors = F) 
  k_dataset_01$ensembl = gsub("(.*)\\.(.*)","\\1",k_dataset_01$ensembl)
  # Match and sort genes in Reference modules 
  k_dataset_01.match = k_dataset_01 %>% filter(ensembl %in% colnames(res_dataset_01))
  k_dataset_01.match = k_dataset_01.match[match(colnames(res_dataset_01), k_dataset_01.match$ensembl),]
  
  multiExpr[[set]] = list(data = res_dataset_01)
  multiColor[[set]] = paste0("M", k_dataset_01.match[,paste0("cluster_lv",cluster_lv)])
}
names(multiExpr) = cell_ids
names(multiColor) = cell_ids

# Here comes the calculation of module preservation, it takes a week for the SN dataset 
enableWGCNAThreads(nThreads = numthreads)
system.time( {
mp = modulePreservation(multiExpr, multiColor, 
                        dataIsExpr = T,
                        networkType = "unsigned", # default: unsigned 
                        referenceNetworks = c(1:nSets), 
                        maxModuleSize = 20000,
                        maxGoldModuleSize = 500, 
                        nPermutations = 200, # Default = 200
                        quickCor = 1,
                        randomSeed = 2022,
                        parallelCalculation = T,
                        verbose = 4)
} )

# Save the results
save(mp, file = paste0(work_dir, "mp_sn_mf_sara.RData"))
load(file = paste0(work_dir, "mp_sn_mf_sara.RData"))
cell_ids = c("ast", "mic", "oli", "end", "opc", "ext", "inh", "Sara", "bulk_MF")
nSets = length(cell_ids)

res_all = data.frame()
for (ref in 1:nSets) {
  for (test in 1:nSets) {
    if(ref!=test){
      statsObs = cbind(mp$quality$observed[[ref]][[test]][,-1], mp$preservation$observed[[ref]][[test]][,-1])
      statsObs$module_id = rownames(statsObs)
  
      statsZ = cbind(mp$quality$Z[[ref]][[test]], mp$preservation$Z[[ref]][[test]][,-1])
      statsZ$module_id = rownames(statsZ)
  
      res_df = statsObs %>% dplyr::left_join(statsZ, by = c("module_id"))
      res_df$ref = cell_ids[ref]
      res_df$test = cell_ids[test]
      rownames(res_df) = NULL
      
      #res_df2 = cbind(statsObs[, c("medianRank.pres", "medianRank.qual")], signif(statsZ[, c("Zsummary.pres", "Zsummary.qual")], 2))
      #res_df2$ref = cell_ids[ref]
      #res_df2$test = cell_ids[test]
      
      res_all = bind_rows(res_all,res_df)
    }
  }
}

res_final = res_all[,c("module_id","moduleSize","ref","test",colnames(res_all)[!colnames(res_all) %in% c("module_id","moduleSize","ref","test")])]
createDT(res_final)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
p <- ggplot(res_final %>% filter(moduleSize >= 30), aes(x = moduleSize, y = Zsummary.pres )) + 
  geom_hline(yintercept = c(2,10), linetype = "dashed") +
  geom_point(shape = 21) +
  geom_label_repel(aes(label = module_id), max.overlaps = 25) +
#  geom_label(aes(label = module_id)) +
  labs(y = "Preservation Zsummary", x = "Module size", title = "Preservation Zsummary") +
  theme_classic() +
  facet_wrap(~ref+test, scales = "free", ncol = 7)

# pdf(file = paste0(work_dir, "h_fisher_mp_majorcells_sn.pdf"), width = 30, height = 40)
# p
# dev.off()

p

#res_final %>% select(module_id,ref,test,Zsummary.pres) %>% mutate(id = paste(ref,module_id,sep = "_")) %>%
#  pivot_wider(id_cols = id, names_from = test, values_from = Zsummary.pres)

res_mtx_sum = res_final %>% filter(moduleSize >= 30) %>% select(module_id,ref,test,Zsummary.pres) %>% mutate(id = paste(ref,module_id,sep = "_")) %>%
  pivot_wider(id_cols = ref, names_from = test, values_from = Zsummary.pres, values_fn = function(x) sum(x<2, na.rm = T)) %>% column_to_rownames("ref")
res_mtx_sum = res_mtx_sum[,rownames(res_mtx_sum)]

suppressPackageStartupMessages(library(ComplexHeatmap))
library(leaflet)

my.breaks <- c(seq(0, max(res_mtx_sum, na.rm = T), by=1))
my.colors <- colorBin("Spectral", bins = my.breaks, na.color = "#aaff56", reverse = T, pretty = T)

draw_colnames_45 <- function (coln, gaps, ...) {
    coord <- pheatmap:::find_coordinates(length(coln), gaps)
    x     <- coord$coord - 0.5 * coord$size
    res   <- grid::textGrob(
      coln, x = x, y = unit(1, "npc") - unit(3,"bigpts"),
      vjust = 0.75, hjust = 1, rot = 90, gp = grid::gpar(...)
    )
    return(res)
}
assignInNamespace(
  x = "draw_colnames",
  value = "draw_colnames_45",
  ns = asNamespace("pheatmap")
)

(p_heat = Heatmap(res_mtx_sum, 
                  cluster_columns = F, cluster_rows = F, 
        heatmap_legend_param = list(title = "# modules Zsum<2"),
        row_names_gp = gpar(fontsize = 10), column_names_gp = gpar(fontsize = 10),
        column_title = "Test set", 
        row_title = "Reference set",
        col = my.colors(my.breaks), 
        rect_gp = gpar(col = "gray", lwd = .8),
        cell_fun = function(j, i, x, y, w, h, col) { # add text to each grid
            grid.text(sprintf("%d", res_mtx_sum[i,j]), x, y, gp = gpar(fontsize = 8))
          }
        ))

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Stream 8
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] leaflet_2.1.1               ComplexHeatmap_2.11.1      
##  [3] SummarizedExperiment_1.24.0 Biobase_2.54.0             
##  [5] GenomicRanges_1.46.1        GenomeInfoDb_1.30.1        
##  [7] IRanges_2.28.0              S4Vectors_0.32.3           
##  [9] BiocGenerics_0.40.0         MatrixGenerics_1.6.0       
## [11] matrixStats_0.61.0          ggeasy_0.1.3               
## [13] ggpubr_0.4.0                ggrepel_0.9.1              
## [15] WGCNA_1.70-3                fastcluster_1.2.3          
## [17] dynamicTreeCut_1.63-1       forcats_0.5.1              
## [19] stringr_1.4.0               dplyr_1.0.8                
## [21] purrr_0.3.4                 readr_2.1.2                
## [23] tidyr_1.2.0                 tibble_3.2.1               
## [25] ggplot2_3.4.1               tidyverse_1.3.1            
## [27] limma_3.50.1               
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.4.1        circlize_0.4.14       
##   [4] Hmisc_4.6-0            splines_4.1.2          crosstalk_1.2.0       
##   [7] digest_0.6.29          foreach_1.5.2          htmltools_0.5.2       
##  [10] GO.db_3.14.0           fansi_1.0.4            magrittr_2.0.3        
##  [13] checkmate_2.0.0        memoise_2.0.1          cluster_2.1.2         
##  [16] doParallel_1.0.17      tzdb_0.2.0             Biostrings_2.62.0     
##  [19] modelr_0.1.8           jpeg_0.1-9             colorspace_2.0-3      
##  [22] blob_1.2.2             rvest_1.0.2            haven_2.4.3           
##  [25] xfun_0.29              crayon_1.5.0           RCurl_1.98-1.6        
##  [28] jsonlite_1.7.3         impute_1.68.0          survival_3.2-13       
##  [31] iterators_1.0.14       glue_1.6.2             gtable_0.3.0          
##  [34] zlibbioc_1.40.0        XVector_0.34.0         GetoptLong_1.0.5      
##  [37] DelayedArray_0.20.0    car_3.0-12             shape_1.4.6           
##  [40] abind_1.4-5            scales_1.2.1           pheatmap_1.0.12       
##  [43] DBI_1.1.2              rstatix_0.7.0          Rcpp_1.0.8            
##  [46] htmlTable_2.4.0        clue_0.3-60            foreign_0.8-81        
##  [49] bit_4.0.4              preprocessCore_1.56.0  Formula_1.2-4         
##  [52] DT_0.20                htmlwidgets_1.5.4      httr_1.4.2            
##  [55] RColorBrewer_1.1-2     ellipsis_0.3.2         pkgconfig_2.0.3       
##  [58] farver_2.1.0           nnet_7.3-16            sass_0.4.0            
##  [61] dbplyr_2.1.1           utf8_1.2.3             tidyselect_1.1.2      
##  [64] labeling_0.4.2         rlang_1.1.1            AnnotationDbi_1.56.2  
##  [67] munsell_0.5.0          cellranger_1.1.0       tools_4.1.2           
##  [70] cachem_1.0.6           cli_3.6.1              generics_0.1.2        
##  [73] RSQLite_2.2.10         broom_0.7.12           evaluate_0.15         
##  [76] fastmap_1.1.0          yaml_2.3.5             knitr_1.37            
##  [79] bit64_4.0.5            fs_1.5.2               KEGGREST_1.34.0       
##  [82] xml2_1.3.3             compiler_4.1.2         rstudioapi_0.13       
##  [85] png_0.1-7              ggsignif_0.6.3         reprex_2.0.1          
##  [88] bslib_0.3.1            stringi_1.7.6          highr_0.9             
##  [91] lattice_0.20-45        Matrix_1.5-1           vctrs_0.6.3           
##  [94] pillar_1.9.0           lifecycle_1.0.3        jquerylib_0.1.4       
##  [97] GlobalOptions_0.1.2    data.table_1.14.2      bitops_1.0-7          
## [100] R6_2.5.1               latticeExtra_0.6-29    gridExtra_2.3         
## [103] codetools_0.2-18       assertthat_0.2.1       rjson_0.2.21          
## [106] withr_2.5.0            GenomeInfoDbData_1.2.7 parallel_4.1.2        
## [109] hms_1.1.1              rpart_4.1-15           rmarkdown_2.11        
## [112] carData_3.0-5          lubridate_1.8.0        base64enc_0.1-3